In [1]:
import pandas as pd
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
In [2]:
data=pd.read_csv('weatherAUS.csv')
In [3]:
data.shape
Out[3]:
(145460, 23)
In [4]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null   float64
 18  Cloud3pm       86102 non-null   float64
 19  Temp9am        143693 non-null  float64
 20  Temp3pm        141851 non-null  float64
 21  RainToday      142199 non-null  object 
 22  RainTomorrow   142193 non-null  object 
dtypes: float64(16), object(7)
memory usage: 25.5+ MB
In [5]:
#drop raws where RainAToday and RainToday has null values
data.dropna(subset=['RainToday','RainTomorrow'],inplace=True)
In [6]:
sns.set_style('darkgrid')
matplotlib.rcParams['font.size']=14
matplotlib.rcParams['figure.figsize']=(10,6)
matplotlib.rcParams['figure.facecolor']='#00000000'
In [7]:
print("Unique locations and their counts :")
location = data["Location"].value_counts()
location = location.reset_index()
location.columns= ["Location", "Counts"]
print(location)
Unique locations and their counts :
            Location  Counts
0           Canberra    3402
1             Sydney    3331
2              Perth    3193
3             Darwin    3192
4             Hobart    3183
5           Brisbane    3132
6            Bendigo    3030
7         Townsville    3027
8       AliceSprings    3025
9       MountGambier    3022
10        Launceston    3020
11          Adelaide    3020
12          Ballarat    3017
13      PerthAirport    3009
14  MelbourneAirport    3009
15           Mildura    3005
16     SydneyAirport    3001
17            Albany    2996
18         Nuriootpa    2996
19              Sale    2992
20            Albury    2991
21          Watsonia    2990
22           Woomera    2984
23          Portland    2984
24             Cobar    2980
25            Cairns    2964
26       Tuggeranong    2959
27        WaggaWagga    2958
28     NorfolkIsland    2944
29        Wollongong    2943
30        SalmonGums    2941
31      CoffsHarbour    2940
32          Dartmoor    2939
33         Newcastle    2929
34       Witchcliffe    2924
35         GoldCoast    2924
36           Penrith    2911
37          Richmond    2906
38         NorahHead    2888
39     BadgerysCreek    2877
40       MountGinini    2816
41             Moree    2791
42           Walpole    2770
43        PearceRAAF    2646
44       Williamtown    2376
45         Melbourne    2298
46              Nhil    1565
47         Katherine    1545
48             Uluru    1502
In [8]:
fig = px.treemap(location, path=['Location'], values='Counts', color='Counts', title="Arrangement Of Locations By Counts From Highest To Lowest")
fig.show()
In [9]:
px.histogram(data,x="Location",title="Location vs Rainy Days",color="RainToday")
In [10]:
#data['RainTomorrow'].value_counts()
sns.countplot(x=data['RainTomorrow'])
Out[10]:
<Axes: xlabel='RainTomorrow', ylabel='count'>
In [11]:
sns.boxplot(x='RainTomorrow', y='Temp3pm', data=data) 
plt.xlabel('Rain')
plt.ylabel('Temp')
plt.show()
In [12]:
Today_Tomo = pd.crosstab(data['RainToday'], data['RainTomorrow'])
Today_Tomo.plot(kind='bar',stacked=False)
plt.xlabel('Rain Today')
plt.ylabel('Count')
plt.title('Rain Today - Rain Tomorrow')
Out[12]:
Text(0.5, 1.0, 'Rain Today - Rain Tomorrow')
In [13]:
px.strip(data.sample(2000),title='Temp 3pm vs Humidity 3 pm',x='Temp3pm',y="Humidity3pm",color="RainTomorrow")
In [14]:
px.histogram(data,x='Temp3pm',title="Temperature at 3 pm vs. Rain Tomorrow",color='RainTomorrow')
In [15]:
 px.histogram(data,x='Humidity3pm',title="Humidity at 3 pm vs. Rain Tomorrow",color='RainTomorrow')
In [16]:
Today_Tomo = pd.crosstab(data['Cloud3pm'], data['RainTomorrow'])
row_sums = Today_Tomo.sum(axis=1)
Today_Tomo_percentage = Today_Tomo.div(row_sums, axis=0) * 100
Today_Tomo_percentage.plot(kind='bar', stacked=True)
plt.xlabel('Rain Today')
plt.ylabel('Percentage')
plt.title('Rain Today - Rain Tomorrow (100% Stacked)')

# Show the graph
plt.show()
In [17]:
 px.histogram(data,x='Sunshine',title="Sunny Hours vs. Rain Tomorrow",color='RainTomorrow')
In [18]:
px.histogram(data,x='Pressure3pm',title="Pressure at 3 pm vs. Rain Tomorrow",color='RainTomorrow')
In [19]:
#px.histogram(data,x='WindGustSpeed',title="Pressure at 3 pm vs. Rain Tomorrow",color='RainTomorrow')

Today_Tomo = pd.crosstab(data['WindGustSpeed'], data['RainTomorrow'])
row_sums = Today_Tomo.sum(axis=1)
Today_Tomo_percentage = Today_Tomo.div(row_sums, axis=0) * 100
Today_Tomo_percentage.plot(kind='bar', stacked=True)
plt.xlabel('WindGustSpeed')
plt.ylabel('Percentage')
plt.title('WindGustSpeed - Rain Tomorrow (100% Stacked)')

# Show the graph
plt.show()
In [20]:
sns.boxplot(x='RainTomorrow', y='WindGustSpeed', data=data) 
plt.xlabel('Rain')
plt.ylabel('Temp')
plt.show()
In [21]:
correlation = data.corr()
plt.figure(figsize=(16,12))
plt.title('Correlation Heatmap of Rain in Australia Dataset')
ax = sns.heatmap(correlation, square=True, annot=True, fmt='.2f', linecolor='white')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels(ax.get_yticklabels(), rotation=30)           
plt.show()
C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\4212842004.py:1: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

In [22]:
plt.figure(figsize=(10,10))
sns.heatmap(correlation[(correlation>0.6) |(correlation<-0.6)],annot=True,fmt='.2f')
Out[22]:
<Axes: >
In [23]:
sns.boxplot(x='RainTomorrow', y='MaxTemp', data=data) 
plt.xlabel('Rain')
plt.ylabel('Temp')
plt.show()
In [24]:
fig, ax = plt.subplots(4, 2, figsize=(15,25))

# WindSpeed9am
sns.distplot(data['WindSpeed9am'], ax=ax[0,0], color='green')
ax[0,0].set_title("Wind Speed at 9AM", fontsize=15)

# WindSpeed3pm
sns.distplot(data['WindSpeed3pm'], ax=ax[0,1], color='green')
ax[0,1].set_title("Wind Speed at 3PM", fontsize=15)

# Humidity9am
sns.distplot(data['Humidity9am'], ax=ax[1,0], color='orange')
ax[1,0].set_title("Humidity at 9AM", fontsize=15)

# Humidity3pm
sns.distplot(data['Humidity3pm'], ax=ax[1,1], color='orange')
ax[1,1].set_title("Humidity at 3PM", fontsize=15)

# Pressure9am
sns.distplot(data['Pressure9am'], ax=ax[2,0], color='red')
ax[2,0].set_title("Pressure at 9AM", fontsize=15)

# Pressure3pm
sns.distplot(data['Pressure3pm'], ax=ax[2,1], color='red')
ax[2,1].set_title("Pressure at 3PM", fontsize=15)

# Temp9am
sns.distplot(data['Temp9am'], ax=ax[3,0], color='blue')
ax[3,0].set_title("Temperature at 9AM", fontsize=15)

# Temp3pm
sns.distplot(data['Temp3pm'], ax=ax[3,1], color='blue')
ax[3,1].set_title("Temperature at 3PM", fontsize=15)
C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:4: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:8: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:12: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:16: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:20: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:24: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:28: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:32: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


Out[24]:
Text(0.5, 1.0, 'Temperature at 3PM')
In [25]:
from sklearn.model_selection import train_test_split
In [26]:
train_val_df,test_df=train_test_split(data,test_size=0.2,random_state=50)
train_df,val_df=train_test_split(train_val_df,test_size=0.25,random_state=50)
In [27]:
print ( 'train :',train_df.shape)
print ( 'test :',test_df.shape)
print ( 'val :',val_df.shape)
train : (84471, 23)
test : (28158, 23)
val : (28158, 23)
In [28]:
plt.title('No. of Rows Per Year')
sns.countplot(x=pd.to_datetime(data.Date).dt.year)
Out[28]:
<Axes: title={'center': 'No. of Rows Per Year'}, xlabel='Date', ylabel='count'>
In [29]:
plt.title('No. of Rows Per Year')
sns.countplot(x=pd.to_datetime(data.Date).dt.month)
Out[29]:
<Axes: title={'center': 'No. of Rows Per Year'}, xlabel='Date', ylabel='count'>